{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "0c588760",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import librosa\n",
    "import glob\n",
    "import numpy as np\n",
    "import xgboost as xgb"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e41b08d3",
   "metadata": {},
   "source": [
    "# 处理训练集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "850a8554",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the training features: one flattened MFCC vector per audio file.\n",
    "# Each sub-directory of `path` is treated as one class; its name is the label.\n",
    "path = os.path.join('初赛训练集', 'cough')\n",
    "max_pad_size = 11  # number of MFCC frames kept per clip\n",
    "fea = []\n",
    "label = []\n",
    "# Sorted listings make the row order reproducible across machines.\n",
    "for class_name in sorted(os.listdir(path)):\n",
    "    class_dir = os.path.join(path, class_name)\n",
    "    if not os.path.isdir(class_dir):\n",
    "        continue  # ignore stray files sitting next to the class folders\n",
    "    for file_name in sorted(os.listdir(class_dir)):\n",
    "        clip_path = os.path.join(class_dir, file_name)\n",
    "        y, sr = librosa.load(path=clip_path, sr=None, mono=False)\n",
    "        # Keep every third sample; the result is treated as 16 kHz audio below.\n",
    "        # NOTE(review): presumably the clips are 48 kHz mono - confirm. With\n",
    "        # mono=False a multi-channel file would be sliced along its channel axis.\n",
    "        y = y[::3]\n",
    "        # librosa's default n_mfcc gives 20 coefficients per frame.\n",
    "        audio_mac = librosa.feature.mfcc(y=y, sr=16000)\n",
    "        n_frames = audio_mac.shape[1]\n",
    "        if n_frames < max_pad_size:\n",
    "            # Zero-pad on the right of the frame axis up to max_pad_size.\n",
    "            pad_size = max_pad_size - n_frames\n",
    "            audio_mac = np.pad(audio_mac, ((0, 0), (0, pad_size)), mode='constant')\n",
    "        else:\n",
    "            # Truncate longer clips to the first max_pad_size frames.\n",
    "            audio_mac = audio_mac[:, :max_pad_size]\n",
    "        fea.append(audio_mac.flatten())\n",
    "        label.append(class_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "34a055db",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assemble the training table: one row per clip, one column per flattened\n",
    "# MFCC value, plus the target column 'label'.\n",
    "df = pd.DataFrame(fea).assign(label=label)\n",
    "# Every column except the target is a model input.\n",
    "fea_names = [col for col in df.columns if col != 'label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "b244421a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[23:14:32] WARNING: ..\\src\\learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
       "              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,\n",
       "              importance_type='gain', interaction_constraints='',\n",
       "              learning_rate=0.300000012, max_delta_step=0, max_depth=6,\n",
       "              min_child_weight=1, missing=nan, monotone_constraints='()',\n",
       "              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,\n",
       "              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,\n",
       "              tree_method='exact', validate_parameters=1, verbosity=None)"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Gradient-boosted tree classifier with library defaults. eval_metric is set\n",
    "# explicitly to 'logloss' (the default for 'binary:logistic' since XGBoost\n",
    "# 1.3.0) so the choice is visible and the changed-default warning goes away.\n",
    "# NOTE(review): df['label'] holds the class directory names (strings); newer\n",
    "# XGBoost releases may require integer-encoded classes - confirm for the\n",
    "# installed version.\n",
    "model = xgb.XGBClassifier(eval_metric='logloss')\n",
    "model.fit(df[fea_names], df['label'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d15c937f",
   "metadata": {},
   "source": [
    "# 处理测试集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "4540e914",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the test features: one flattened MFCC vector per file in `path`.\n",
    "# NOTE: `path` and `fea` are rebound here, replacing the training values;\n",
    "# re-run the training cells before rebuilding the training DataFrame.\n",
    "path = '初赛测试集'\n",
    "max_pad_size = 11  # number of MFCC frames kept per clip\n",
    "fea = []\n",
    "files = []\n",
    "# Sorted listing makes the submission row order reproducible across machines.\n",
    "for file_name in sorted(os.listdir(path)):\n",
    "    files.append(file_name)\n",
    "    clip_path = os.path.join(path, file_name)\n",
    "    y, sr = librosa.load(path=clip_path, sr=None, mono=False)\n",
    "    # Keep every third sample; the result is treated as 16 kHz audio below.\n",
    "    # NOTE(review): presumably the clips are 48 kHz mono - confirm. With\n",
    "    # mono=False a multi-channel file would be sliced along its channel axis.\n",
    "    y = y[::3]\n",
    "    # librosa's default n_mfcc gives 20 coefficients per frame.\n",
    "    audio_mac = librosa.feature.mfcc(y=y, sr=16000)\n",
    "    n_frames = audio_mac.shape[1]\n",
    "    if n_frames < max_pad_size:\n",
    "        # Zero-pad on the right of the frame axis up to max_pad_size.\n",
    "        pad_size = max_pad_size - n_frames\n",
    "        audio_mac = np.pad(audio_mac, ((0, 0), (0, pad_size)), mode='constant')\n",
    "    else:\n",
    "        # Truncate longer clips to the first max_pad_size frames.\n",
    "        audio_mac = audio_mac[:, :max_pad_size]\n",
    "    fea.append(audio_mac.flatten())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "423e324b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score the test clips and keep the predicted class next to the features.\n",
    "df2 = pd.DataFrame(fea).assign(\n",
    "    category_id=lambda frame: model.predict(frame[fea_names])\n",
    ")\n",
    "# Distribution of predicted classes (displayed as the cell output).\n",
    "df2['category_id'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "64bc86d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach the test file names and write the two-column submission file.\n",
    "df2['sample_id'] = files\n",
    "df2.loc[:, ['sample_id', 'category_id']].to_csv('sub.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "127bfc2b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternative approach using Keras (or another method); kept commented out.\n",
    "# NOTE(review): as written, Dense(1, activation='softmax') always outputs 1.0,\n",
    "# and categorical_crossentropy expects one-hot targets. For the 0/1 labels\n",
    "# built below, use activation='sigmoid' with binary_crossentropy before\n",
    "# re-enabling this cell.\n",
    "# df['label'] = df['label'].apply(lambda x:0 if x=='Negative' else 1)\n",
    "\n",
    "# from sklearn.model_selection import train_test_split\n",
    "# x_train, x_val, y_train, y_val = train_test_split(df[fea_names], df['label'], test_size=0.2, random_state=42)\n",
    "\n",
    "# from keras.models import Sequential\n",
    "# from keras.layers import Dense\n",
    "# import keras\n",
    "\n",
    "# model = Sequential()\n",
    "# model.add(Dense(64, activation='relu', input_shape=(220,)))\n",
    "# model.add(Dense(64, activation='relu'))\n",
    "# model.add(Dense(64, activation='relu'))\n",
    "# model.add(Dense(1, activation='softmax'))\n",
    "\n",
    "# model.compile(loss=keras.losses.categorical_crossentropy,\n",
    "#               optimizer=keras.optimizers.RMSprop(),\n",
    "#               metrics=['accuracy'])\n",
    "# model.fit(x_train, y_train, batch_size=30, epochs=20, verbose=1,validation_data=(x_val, y_val))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
